started: AL13Jul2019
last updated: AL16Sep2019

Summary

Eigenvectors calculated from 293 variants (non-rara, not in LD)
for 3,216 samples = 2,504 kgen + 715 wecare-nfe (258UBC, 257CBC and 197NFE)

Start_section

Sys.time()
## [1] "2019-09-16 19:26:12 BST"
rm(list=ls())
graphics.off()

library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s12_joined_PCA/s06_explore_joined_PCA_plots"

opts_knit$set(root.dir = base_folder)

options(stringsAsFactors = F)
options(warnPartialMatchArgs = T, 
        warnPartialMatchAttr = T, 
        warnPartialMatchDollar = T)

#options(error = browser()) # Type Q or c to exit, drop browser level
# https://support.rstudio.com/hc/en-us/articles/200713843?version=1.1.456&mode=desktop
# https://stackoverflow.com/questions/13052522/how-to-leave-the-r-browser-mode-in-the-console-window/13052588 

Read_data

source_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s11_remove_BRCA_PALB_carriers"
load(paste(source_folder, "s03_exclude_BRCA1_BCRA2_PALB2_carriers.RData", sep="/"))

source_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s12_joined_PCA/s05_calculate_joined_1kgp_ampliseq_nfe_PCs/data/s04_pca"
eigenvectors_file <- paste(source_folder, "ampliseq_nfe_1kg_293_3216_100PCs.eigenvec", sep="/")
eigenvectors.df <- read.table(eigenvectors_file, header=T, sep="\t",quote="")
eigenvalues_file <- paste(source_folder, "ampliseq_nfe_1kg_293_3216_100PCs.eigenval", sep="/")
eigenvalues.df <- read.table(eigenvalues_file, header=F, sep="\t",quote="")

source_folder="/Users/alexey/Documents/resources/1kgp"
kg_phenotypes_file <- paste(source_folder, "integrated_call_samples_v3.20130502.ALL.panel", sep="/")
kg_phenotypes.df <- read.table(kg_phenotypes_file, header=T)

rm(source_folder, eigenvectors_file, eigenvalues_file, kg_phenotypes_file, genotypes.mx, variants.df)

Check data

ls()
## [1] "base_folder"      "eigenvalues.df"   "eigenvectors.df"  "kg_phenotypes.df" "phenotypes.df"
dim(eigenvectors.df)
## [1] 3216  102
dim(eigenvalues.df)
## [1] 100   1
dim(kg_phenotypes.df)
## [1] 2504    4
dim(phenotypes.df)
## [1] 712  24
table(phenotypes.df$cc)
## 
##  -1   0   1 
## 197 258 257

Update eigenvectors

eigenvectors.df[1:5,1:5]
##       FID     IID         PC1         PC2       PC3
## 1 HG00096 HG00096 -0.01346760 -0.01867100 0.0100275
## 2 HG00097 HG00097 -0.00802613 -0.00905261 0.0137784
## 3 HG00099 HG00099 -0.00888039 -0.00760967 0.0204807
## 4 HG00100 HG00100 -0.00824455 -0.01715050 0.0106874
## 5 HG00101 HG00101 -0.01323710 -0.01055170 0.0104988
rownames(eigenvectors.df) <- eigenvectors.df$FID
eigenvectors.df <- eigenvectors.df[,-1]
eigenvectors.df[1:5,1:5]
##             IID         PC1         PC2       PC3          PC4
## HG00096 HG00096 -0.01346760 -0.01867100 0.0100275 -0.018377300
## HG00097 HG00097 -0.00802613 -0.00905261 0.0137784 -0.021521600
## HG00099 HG00099 -0.00888039 -0.00760967 0.0204807  0.010075800
## HG00100 HG00100 -0.00824455 -0.01715050 0.0106874  0.006950090
## HG00101 HG00101 -0.01323710 -0.01055170 0.0104988 -0.000148708

Look at eigenvalues

plot(eigenvalues.df$V1, type="b", ylab="Variance",
     main="Top 100 eigenvectors")

plot(eigenvalues.df$V1[1:10], type="b", ylab="Variance",
     main="Top 10 eigenvectors")

Prepare data for Ampliseq-1kgp PCA plot (w/o re-called NFE samples)

Expected order of samples:

# Check the expected order of samples
3216 - 197
## [1] 3019
eigenvectors.df[c(2504,2505),c("IID","PC1")]
##                     IID         PC1
## NA21144         NA21144 -0.00774857
## 100_S8_L007 100_S8_L007 -0.01389610
eigenvectors.df[c(3019,3020),c("IID","PC1")]
##                     IID         PC1
## 9_S346_L008 9_S346_L008 -0.01309050
## 2:HG00097     2:HG00097 -0.00858656
# Prepare table with nfe IDs
nfe_pca <- eigenvectors.df$IID[3020:3216]
nfe_ampliseq <- sub("2:","",nfe_pca)
nfe.df <- data.frame(nfe_ampliseq, nfe_pca)

# Remove overlapping NFE from eigenvectors data
selected_samples <- ! eigenvectors.df$IID %in% nfe.df$nfe_pca
sum(selected_samples)
## [1] 3019
515+2504
## [1] 3019
eigenvectors_ampliseq_kgen.df <- eigenvectors.df[selected_samples,1:6]
"sample" -> colnames(eigenvectors_ampliseq_kgen.df)[1]

# Prepare phenotypes for ampliseq-kgen data
phenotypes.df[c(515:516),c(1,2)]
##                long_ids illumina_id
## 9_S346_L008 9_S346_L008        S346
## HG00097         HG00097        <NA>
phenotypes_ampliseq.df <- phenotypes.df[1:515,c("long_ids","cc")]
table(phenotypes_ampliseq.df$cc)
## 
##   0   1 
## 258 257
"WECARE" -> phenotypes_ampliseq.df$cc[phenotypes_ampliseq.df$cc==1]
"WECARE" -> phenotypes_ampliseq.df$cc[phenotypes_ampliseq.df$cc==0]
table(phenotypes_ampliseq.df$cc)
## 
## WECARE 
##    515
c("sample","group") -> colnames(phenotypes_ampliseq.df)

phenotypes_kgen.df <- kg_phenotypes.df[,c("sample","super_pop")]
c("sample","group") -> colnames(phenotypes_kgen.df)

phenotypes_ampliseq_kgen.df <- rbind(phenotypes_kgen.df,phenotypes_ampliseq.df)
table(phenotypes_ampliseq_kgen.df$group)
## 
##    AFR    AMR    EAS    EUR    SAS WECARE 
##    661    347    504    503    489    515
# Add eigenvectors to phenotypes
eigenphen_ampliseq_kgen.df <- full_join(
  phenotypes_ampliseq_kgen.df, eigenvectors_ampliseq_kgen.df, by="sample")
dim(eigenphen_ampliseq_kgen.df)
## [1] 3019    7
head(eigenphen_ampliseq_kgen.df)
##    sample group         PC1         PC2         PC3          PC4         PC5
## 1 HG00096   EUR -0.01346760 -0.01867100  0.01002750 -0.018377300 -0.04498510
## 2 HG00097   EUR -0.00802613 -0.00905261  0.01377840 -0.021521600 -0.01097730
## 3 HG00099   EUR -0.00888039 -0.00760967  0.02048070  0.010075800 -0.01307020
## 4 HG00100   EUR -0.00824455 -0.01715050  0.01068740  0.006950090 -0.02188560
## 5 HG00101   EUR -0.01323710 -0.01055170  0.01049880 -0.000148708 -0.02711300
## 6 HG00102   EUR -0.00800988 -0.00603888 -0.00810086 -0.010844000  0.00328548
tail(eigenphen_ampliseq_kgen.df)
##            sample  group        PC1          PC2         PC3         PC4         PC5
## 3014 95_S517_L008 WECARE -0.0131194  0.000875954  0.00179746 -0.01049060 -0.03627300
## 3015 96_S236_L007 WECARE -0.0104293 -0.009735540  0.01409190 -0.00250150 -0.01461430
## 3016 97_S509_L008 WECARE -0.0117182 -0.016958600  0.02451320 -0.00369948 -0.00244285
## 3017 98_S335_L008 WECARE -0.0117834 -0.011329700  0.00764198 -0.01465350  0.00850264
## 3018 99_S418_L008 WECARE -0.0136061 -0.017442000  0.00376609 -0.00200121 -0.00342837
## 3019  9_S346_L008 WECARE -0.0130905 -0.009823610 -0.00458896 -0.02066650  0.01425460
# Clean-up
rm(nfe_pca, nfe_ampliseq, selected_samples, eigenvectors_ampliseq_kgen.df, kg_phenotypes.df,
   phenotypes_ampliseq.df, phenotypes_ampliseq_kgen.df, nfe.df, phenotypes.df)

Wecare PCA plot

# Set outliers thresholds (manually selected on the basis of visual assessment of plots)
pc1_th <- 0 # 0.005
pc2_th <- 0.0075 # 0.01

# Prepare vector fr colour scale
myColours <- c("EUR"="BLUE", "AFR"="YELLOW", "AMR"="GREEN",
               "SAS"="GREY", "EAS"="PINK", 
               "WECARE"="RED")

myColourScale <- scale_colour_manual(values=myColours)

# Static plot
ggplot(eigenphen_ampliseq_kgen.df, aes(PC1,PC2)) + 
  geom_point(aes(col=group)) +
  labs(title="293 non-rare variants not in LD", x="PC1", y="PC2") +
  geom_vline(xintercept=pc1_th, linetype="dashed", size=0.5) +
  geom_hline(yintercept=pc2_th, linetype="dashed", size=0.5) +
  myColourScale

# Interactive plot
plotly_group <- factor(eigenphen_ampliseq_kgen.df$group,
  levels=c("AFR","AMR","EAS","SAS","EUR","WECARE"))

g <- ggplot(eigenphen_ampliseq_kgen.df, aes(PC1,PC2)) + 
  geom_point(aes(col=plotly_group, text=sample)) +
  labs(title="293 non-rare variants not in LD", x="PC1", y="PC2") +
  theme(legend.title=element_blank()) + # To suppress the legend title, otherwise it would be "plotly_group
  geom_vline(xintercept=pc1_th, linetype="dashed", size=0.5) +
  geom_hline(yintercept=pc2_th, linetype="dashed", size=0.5) +
  myColourScale  
## Warning: Ignoring unknown aesthetics: text
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates 
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(myColours, myColourScale, plotly_group, g)

Prepare data for recalled NFE - 1kgp PCA plot (w/o Ampliseq)

Expected order of samples:

# Check the expected order of samples
3216 - 197
## [1] 3019
eigenvectors.df[c(2504,2505),c("IID","PC1")]
##                     IID         PC1
## NA21144         NA21144 -0.00774857
## 100_S8_L007 100_S8_L007 -0.01389610
eigenvectors.df[c(3019,3020),c("IID","PC1")]
##                     IID         PC1
## 9_S346_L008 9_S346_L008 -0.01309050
## 2:HG00097     2:HG00097 -0.00858656
# Prepare table with ampliseq IDs
ampliseq_ids <- eigenvectors.df$IID[2505:3019]

# Remove ampliseq from eigenvectors data
selected_samples <- ! eigenvectors.df$IID %in% ampliseq_ids
sum(selected_samples)
## [1] 2701
197+2504
## [1] 2701
eigenvectors_nfe_kgen.df <- eigenvectors.df[selected_samples,1:6]
"sample" -> colnames(eigenvectors_nfe_kgen.df)[1]

# Prepare phenotypes for ampliseq-kgen data
eigenvectors_nfe_kgen.df$sample[2504:2505]
## [1] "NA21144"   "2:HG00097"
phenotypes_nfe.df <- data.frame(sample=eigenvectors_nfe_kgen.df$sample[2505:2701], group="Re-processed NFE")

phenotypes_nfe_kgen.df <- rbind(phenotypes_kgen.df,phenotypes_nfe.df)
table(phenotypes_nfe_kgen.df$group)
## 
##              AFR              AMR              EAS              EUR Re-processed NFE              SAS 
##              661              347              504              503              197              489
# Add eigenvectors to phenotypes
eigenphen_nfe_kgen.df <- full_join(
  phenotypes_nfe_kgen.df, eigenvectors_nfe_kgen.df, by="sample")
dim(eigenphen_nfe_kgen.df)
## [1] 2701    7
head(eigenphen_nfe_kgen.df)
##    sample group         PC1         PC2         PC3          PC4         PC5
## 1 HG00096   EUR -0.01346760 -0.01867100  0.01002750 -0.018377300 -0.04498510
## 2 HG00097   EUR -0.00802613 -0.00905261  0.01377840 -0.021521600 -0.01097730
## 3 HG00099   EUR -0.00888039 -0.00760967  0.02048070  0.010075800 -0.01307020
## 4 HG00100   EUR -0.00824455 -0.01715050  0.01068740  0.006950090 -0.02188560
## 5 HG00101   EUR -0.01323710 -0.01055170  0.01049880 -0.000148708 -0.02711300
## 6 HG00102   EUR -0.00800988 -0.00603888 -0.00810086 -0.010844000  0.00328548
tail(eigenphen_nfe_kgen.df)
##         sample            group         PC1        PC2          PC3          PC4          PC5
## 2696 2:NA20819 Re-processed NFE -0.01013920 -0.0153473 -0.006202510 -0.014849900 -0.030602700
## 2697 2:NA20821 Re-processed NFE -0.00804785 -0.0161630 -0.000549097  0.000697943  0.000173721
## 2698 2:NA20822 Re-processed NFE -0.01064120 -0.0206488 -0.005233230 -0.035010200 -0.000166197
## 2699 2:NA20826 Re-processed NFE -0.01126070 -0.0204990 -0.013083800 -0.006576950 -0.015059900
## 2700 2:NA20828 Re-processed NFE -0.01389330 -0.0263133 -0.004199550 -0.027727600  0.003555860
## 2701 2:NA20832 Re-processed NFE -0.00624664 -0.0153104 -0.018062800 -0.006331740  0.012722500
# Clean-up
rm(ampliseq_ids, selected_samples, eigenvectors_nfe_kgen.df, 
   phenotypes_nfe.df, phenotypes_nfe_kgen.df, phenotypes_kgen.df)

Re-called NFE PCA

# Prepare vector fr colour scale
myColours <- c("EUR"="BLUE", "AFR"="YELLOW", "AMR"="GREEN",
               "SAS"="GREY", "EAS"="PINK", 
               "Re-processed NFE"="CYAN")

myColourScale <- scale_colour_manual(values=myColours)

# Static plot
ggplot(eigenphen_nfe_kgen.df, aes(PC1,PC2)) + 
  geom_point(aes(col=group)) +
  labs(title="293 non-rare variants not in LD", x="PC1", y="PC2") +
  geom_vline(xintercept=pc1_th, linetype="dashed", size=0.5) +
  geom_hline(yintercept=pc2_th, linetype="dashed", size=0.5) +
  myColourScale

# Interactive plot
plotly_group <- factor(eigenphen_nfe_kgen.df$group,
  levels=c("AFR","AMR","EAS","SAS","EUR","Re-processed NFE"))

g <- ggplot(eigenphen_nfe_kgen.df, aes(PC1,PC2)) + 
  geom_point(aes(col=plotly_group, text=sample)) +
  labs(title="293 non-rare variants not in LD", x="PC1", y="PC2") +
  theme(legend.title=element_blank()) + # To suppress the legend title, otherwise it would be "plotly_group
  geom_vline(xintercept=pc1_th, linetype="dashed", size=0.5) +
  geom_hline(yintercept=pc2_th, linetype="dashed", size=0.5) +
  myColourScale  
## Warning: Ignoring unknown aesthetics: text
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates 
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(pc1_th, pc2_th, myColours, myColourScale, plotly_group, g)

Save data

save.image(paste(base_folder, "s01_joined_PCA_plots_293_3216_not_rare_not_in_LD.RData", sep="/"))

Final_section

ls()
## [1] "base_folder"                "eigenphen_ampliseq_kgen.df" "eigenphen_nfe_kgen.df"      "eigenvalues.df"             "eigenvectors.df"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] plotly_4.9.0  ggplot2_3.2.0 dplyr_0.8.1   knitr_1.23   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.1        later_0.8.0       pillar_1.4.1      compiler_3.5.1    tools_3.5.1       digest_0.6.19     jsonlite_1.6      evaluate_0.14     tibble_2.1.3      gtable_0.3.0      viridisLite_0.3.0 pkgconfig_2.0.2   rlang_0.3.4       shiny_1.3.2       crosstalk_1.0.0   yaml_2.2.0        xfun_0.7          withr_2.1.2       stringr_1.4.0     httr_1.4.0        htmlwidgets_1.3   grid_3.5.1        tidyselect_0.2.5  glue_1.3.1        data.table_1.12.2 R6_2.4.0          rmarkdown_1.13    purrr_0.3.2       tidyr_0.8.3       magrittr_1.5      promises_1.0.1    scales_1.0.0      htmltools_0.3.6   assertthat_0.2.1  xtable_1.8-4      mime_0.7          colorspace_1.4-1  httpuv_1.5.1      labeling_0.3      stringi_1.4.3     lazyeval_0.2.2    munsell_0.5.0     crayon_1.3.4
Sys.time()
## [1] "2019-09-16 19:26:17 BST"